1   /*
2    * Copyright (C) 2009 The Guava Authors
3    *
4    * Licensed under the Apache License, Version 2.0 (the "License");
5    * you may not use this file except in compliance with the License.
6    * You may obtain a copy of the License at
7    *
8    * http://www.apache.org/licenses/LICENSE-2.0
9    *
10   * Unless required by applicable law or agreed to in writing, software
11   * distributed under the License is distributed on an "AS IS" BASIS,
12   * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13   * See the License for the specific language governing permissions and
14   * limitations under the License.
15   */
16  
17  package com.google.common.base;
18  
19  import static com.google.common.base.Preconditions.checkArgument;
20  import static com.google.common.base.Preconditions.checkNotNull;
21  
22  import com.google.common.annotations.Beta;
23  import com.google.common.annotations.GwtCompatible;
24  
25  import java.util.ArrayList;
26  import java.util.Collections;
27  import java.util.Iterator;
28  import java.util.LinkedHashMap;
29  import java.util.List;
30  import java.util.Map;
31  
32  import javax.annotation.CheckReturnValue;
33  
34  /**
35   * Extracts non-overlapping substrings from an input string, typically by
36   * recognizing appearances of a <i>separator</i> sequence. This separator can be
37   * specified as a single {@linkplain #on(char) character}, fixed {@linkplain
38   * #on(String) string}, {@linkplain #onPattern regular expression} or {@link
39   * #on(CharMatcher) CharMatcher} instance. Or, instead of using a separator at
40   * all, a splitter can extract adjacent substrings of a given {@linkplain
41   * #fixedLength fixed length}.
42   *
43   * <p>For example, this expression: <pre>   {@code
44   *
45   *   Splitter.on(',').split("foo,bar,qux")}</pre>
46   *
47   * ... produces an {@code Iterable} containing {@code "foo"}, {@code "bar"} and
48   * {@code "qux"}, in that order.
49   *
50   * <p>By default, {@code Splitter}'s behavior is simplistic and unassuming. The
51   * following expression: <pre>   {@code
52   *
53   *   Splitter.on(',').split(" foo,,,  bar ,")}</pre>
54   *
55   * ... yields the substrings {@code [" foo", "", "", "  bar ", ""]}. If this
56   * is not the desired behavior, use configuration methods to obtain a <i>new</i>
57   * splitter instance with modified behavior: <pre>   {@code
58   *
59   *   private static final Splitter MY_SPLITTER = Splitter.on(',')
60   *       .trimResults()
61   *       .omitEmptyStrings();}</pre>
62   *
63   * <p>Now {@code MY_SPLITTER.split("foo,,,  bar ,")} returns just {@code ["foo",
64   * "bar"]}. Note that the order in which these configuration methods are called
65   * is never significant.
66   *
67   * <p><b>Warning:</b> Splitter instances are immutable. Invoking a configuration
68   * method has no effect on the receiving instance; you must store and use the
69   * new splitter instance it returns instead. <pre>   {@code
70   *
71   *   // Do NOT do this
72   *   Splitter splitter = Splitter.on('/');
73   *   splitter.trimResults(); // does nothing!
74   *   return splitter.split("wrong / wrong / wrong");}</pre>
75   *
76   * <p>For separator-based splitters that do not use {@code omitEmptyStrings}, an
77   * input string containing {@code n} occurrences of the separator naturally
78   * yields an iterable of size {@code n + 1}. So if the separator does not occur
79   * anywhere in the input, a single substring is returned containing the entire
80   * input. Consequently, all splitters split the empty string to {@code [""]}
81   * (note: even fixed-length splitters).
82   *
83   * <p>Splitter instances are thread-safe immutable, and are therefore safe to
84   * store as {@code static final} constants.
85   *
86   * <p>The {@link Joiner} class provides the inverse operation to splitting, but
87   * note that a round-trip between the two should be assumed to be lossy.
88   *
89   * <p>See the Guava User Guide article on <a href=
90   * "http://code.google.com/p/guava-libraries/wiki/StringsExplained#Splitter">
91   * {@code Splitter}</a>.
92   *
93   * @author Julien Silland
94   * @author Jesse Wilson
95   * @author Kevin Bourrillion
96   * @author Louis Wasserman
97   * @since 1.0
98   */
99  @GwtCompatible(emulated = true)
100 public final class Splitter {
101   private final CharMatcher trimmer;
102   private final boolean omitEmptyStrings;
103   private final Strategy strategy;
104   private final int limit;
105 
106   private Splitter(Strategy strategy) {
107     this(strategy, false, CharMatcher.NONE, Integer.MAX_VALUE);
108   }
109 
110   private Splitter(Strategy strategy, boolean omitEmptyStrings,
111       CharMatcher trimmer, int limit) {
112     this.strategy = strategy;
113     this.omitEmptyStrings = omitEmptyStrings;
114     this.trimmer = trimmer;
115     this.limit = limit;
116   }
117 
118   /**
119    * Returns a splitter that uses the given single-character separator. For
120    * example, {@code Splitter.on(',').split("foo,,bar")} returns an iterable
121    * containing {@code ["foo", "", "bar"]}.
122    *
123    * @param separator the character to recognize as a separator
124    * @return a splitter, with default settings, that recognizes that separator
125    */
126   public static Splitter on(char separator) {
127     return on(CharMatcher.is(separator));
128   }
129 
130   /**
131    * Returns a splitter that considers any single character matched by the
132    * given {@code CharMatcher} to be a separator. For example, {@code
133    * Splitter.on(CharMatcher.anyOf(";,")).split("foo,;bar,quux")} returns an
134    * iterable containing {@code ["foo", "", "bar", "quux"]}.
135    *
136    * @param separatorMatcher a {@link CharMatcher} that determines whether a
137    *     character is a separator
138    * @return a splitter, with default settings, that uses this matcher
139    */
140   public static Splitter on(final CharMatcher separatorMatcher) {
141     checkNotNull(separatorMatcher);
142 
143     return new Splitter(new Strategy() {
144       @Override public SplittingIterator iterator(
145           Splitter splitter, final CharSequence toSplit) {
146         return new SplittingIterator(splitter, toSplit) {
147           @Override int separatorStart(int start) {
148             return separatorMatcher.indexIn(toSplit, start);
149           }
150 
151           @Override int separatorEnd(int separatorPosition) {
152             return separatorPosition + 1;
153           }
154         };
155       }
156     });
157   }
158 
159   /**
160    * Returns a splitter that uses the given fixed string as a separator. For
161    * example, {@code Splitter.on(", ").split("foo, bar,baz")} returns an
162    * iterable containing {@code ["foo", "bar,baz"]}.
163    *
164    * @param separator the literal, nonempty string to recognize as a separator
165    * @return a splitter, with default settings, that recognizes that separator
166    */
167   public static Splitter on(final String separator) {
168     checkArgument(separator.length() != 0,
169         "The separator may not be the empty string.");
170 
171     return new Splitter(new Strategy() {
172       @Override public SplittingIterator iterator(
173           Splitter splitter, CharSequence toSplit) {
174         return new SplittingIterator(splitter, toSplit) {
175           @Override public int separatorStart(int start) {
176             int separatorLength = separator.length();
177 
178             positions:
179             for (int p = start, last = toSplit.length() - separatorLength;
180                 p <= last; p++) {
181               for (int i = 0; i < separatorLength; i++) {
182                 if (toSplit.charAt(i + p) != separator.charAt(i)) {
183                   continue positions;
184                 }
185               }
186               return p;
187             }
188             return -1;
189           }
190 
191           @Override public int separatorEnd(int separatorPosition) {
192             return separatorPosition + separator.length();
193           }
194         };
195       }
196     });
197   }
198 
199   /**
200    * Returns a splitter that divides strings into pieces of the given length.
201    * For example, {@code Splitter.fixedLength(2).split("abcde")} returns an
202    * iterable containing {@code ["ab", "cd", "e"]}. The last piece can be
203    * smaller than {@code length} but will never be empty.
204    *
205    * <p><b>Exception:</b> for consistency with separator-based splitters, {@code
206    * split("")} does not yield an empty iterable, but an iterable containing
207    * {@code ""}. This is the only case in which {@code
208    * Iterables.size(split(input))} does not equal {@code
209    * IntMath.divide(input.length(), length, CEILING)}. To avoid this behavior,
210    * use {@code omitEmptyStrings}.
211    *
212    * @param length the desired length of pieces after splitting, a positive
213    *     integer
214    * @return a splitter, with default settings, that can split into fixed sized
215    *     pieces
216    * @throws IllegalArgumentException if {@code length} is zero or negative
217    */
218   public static Splitter fixedLength(final int length) {
219     checkArgument(length > 0, "The length may not be less than 1");
220 
221     return new Splitter(new Strategy() {
222       @Override public SplittingIterator iterator(
223           final Splitter splitter, CharSequence toSplit) {
224         return new SplittingIterator(splitter, toSplit) {
225           @Override public int separatorStart(int start) {
226             int nextChunkStart = start + length;
227             return (nextChunkStart < toSplit.length() ? nextChunkStart : -1);
228           }
229 
230           @Override public int separatorEnd(int separatorPosition) {
231             return separatorPosition;
232           }
233         };
234       }
235     });
236   }
237 
238   /**
239    * Returns a splitter that behaves equivalently to {@code this} splitter, but
240    * automatically omits empty strings from the results. For example, {@code
241    * Splitter.on(',').omitEmptyStrings().split(",a,,,b,c,,")} returns an
242    * iterable containing only {@code ["a", "b", "c"]}.
243    *
244    * <p>If either {@code trimResults} option is also specified when creating a
245    * splitter, that splitter always trims results first before checking for
246    * emptiness. So, for example, {@code
247    * Splitter.on(':').omitEmptyStrings().trimResults().split(": : : ")} returns
248    * an empty iterable.
249    *
250    * <p>Note that it is ordinarily not possible for {@link #split(CharSequence)}
251    * to return an empty iterable, but when using this option, it can (if the
252    * input sequence consists of nothing but separators).
253    *
254    * @return a splitter with the desired configuration
255    */
256   @CheckReturnValue
257   public Splitter omitEmptyStrings() {
258     return new Splitter(strategy, true, trimmer, limit);
259   }
260 
261   /**
262    * Returns a splitter that behaves equivalently to {@code this} splitter but
263    * stops splitting after it reaches the limit.
264    * The limit defines the maximum number of items returned by the iterator.
265    *
266    * <p>For example,
267    * {@code Splitter.on(',').limit(3).split("a,b,c,d")} returns an iterable
268    * containing {@code ["a", "b", "c,d"]}.  When omitting empty strings, the
269    * omitted strings do no count.  Hence,
270    * {@code Splitter.on(',').limit(3).omitEmptyStrings().split("a,,,b,,,c,d")}
271    * returns an iterable containing {@code ["a", "b", "c,d"}.
272    * When trim is requested, all entries, including the last are trimmed.  Hence
273    * {@code Splitter.on(',').limit(3).trimResults().split(" a , b , c , d ")}
274    * results in @{code ["a", "b", "c , d"]}.
275    *
276    * @param limit the maximum number of items returns
277    * @return a splitter with the desired configuration
278    * @since 9.0
279    */
280   @CheckReturnValue
281   public Splitter limit(int limit) {
282     checkArgument(limit > 0, "must be greater than zero: %s", limit);
283     return new Splitter(strategy, omitEmptyStrings, trimmer, limit);
284   }
285 
286   /**
287    * Returns a splitter that behaves equivalently to {@code this} splitter, but
288    * automatically removes leading and trailing {@linkplain
289    * CharMatcher#WHITESPACE whitespace} from each returned substring; equivalent
290    * to {@code trimResults(CharMatcher.WHITESPACE)}. For example, {@code
291    * Splitter.on(',').trimResults().split(" a, b ,c ")} returns an iterable
292    * containing {@code ["a", "b", "c"]}.
293    *
294    * @return a splitter with the desired configuration
295    */
296   @CheckReturnValue
297   public Splitter trimResults() {
298     return trimResults(CharMatcher.WHITESPACE);
299   }
300 
301   /**
302    * Returns a splitter that behaves equivalently to {@code this} splitter, but
303    * removes all leading or trailing characters matching the given {@code
304    * CharMatcher} from each returned substring. For example, {@code
305    * Splitter.on(',').trimResults(CharMatcher.is('_')).split("_a ,_b_ ,c__")}
306    * returns an iterable containing {@code ["a ", "b_ ", "c"]}.
307    *
308    * @param trimmer a {@link CharMatcher} that determines whether a character
309    *     should be removed from the beginning/end of a subsequence
310    * @return a splitter with the desired configuration
311    */
312   // TODO(kevinb): throw if a trimmer was already specified!
313   @CheckReturnValue
314   public Splitter trimResults(CharMatcher trimmer) {
315     checkNotNull(trimmer);
316     return new Splitter(strategy, omitEmptyStrings, trimmer, limit);
317   }
318 
319   /**
320    * Splits {@code sequence} into string components and makes them available
321    * through an {@link Iterator}, which may be lazily evaluated. If you want
322    * an eagerly computed {@link List}, use {@link #splitToList(CharSequence)}.
323    *
324    * @param sequence the sequence of characters to split
325    * @return an iteration over the segments split from the parameter.
326    */
327   public Iterable<String> split(final CharSequence sequence) {
328     checkNotNull(sequence);
329 
330     return new Iterable<String>() {
331       @Override public Iterator<String> iterator() {
332         return splittingIterator(sequence);
333       }
334       @Override public String toString() {
335         return Joiner.on(", ")
336             .appendTo(new StringBuilder().append('['), this)
337             .append(']')
338             .toString();
339       }
340     };
341   }
342 
343   private Iterator<String> splittingIterator(CharSequence sequence) {
344     return strategy.iterator(this, sequence);
345   }
346 
347   /**
348    * Splits {@code sequence} into string components and returns them as
349    * an immutable list. If you want an {@link Iterable} which may be lazily
350    * evaluated, use {@link #split(CharSequence)}.
351    *
352    * @param sequence the sequence of characters to split
353    * @return an immutable list of the segments split from the parameter
354    * @since 15.0
355    */
356   @Beta
357   public List<String> splitToList(CharSequence sequence) {
358     checkNotNull(sequence);
359 
360     Iterator<String> iterator = splittingIterator(sequence);
361     List<String> result = new ArrayList<String>();
362 
363     while (iterator.hasNext()) {
364       result.add(iterator.next());
365     }
366 
367     return Collections.unmodifiableList(result);
368   }
369 
370   /**
371    * Returns a {@code MapSplitter} which splits entries based on this splitter,
372    * and splits entries into keys and values using the specified separator.
373    *
374    * @since 10.0
375    */
376   @CheckReturnValue
377   @Beta
378   public MapSplitter withKeyValueSeparator(String separator) {
379     return withKeyValueSeparator(on(separator));
380   }
381 
382   /**
383    * Returns a {@code MapSplitter} which splits entries based on this splitter,
384    * and splits entries into keys and values using the specified separator.
385    *
386    * @since 14.0
387    */
388   @CheckReturnValue
389   @Beta
390   public MapSplitter withKeyValueSeparator(char separator) {
391     return withKeyValueSeparator(on(separator));
392   }
393 
394   /**
395    * Returns a {@code MapSplitter} which splits entries based on this splitter,
396    * and splits entries into keys and values using the specified key-value
397    * splitter.
398    *
399    * @since 10.0
400    */
401   @CheckReturnValue
402   @Beta
403   public MapSplitter withKeyValueSeparator(Splitter keyValueSplitter) {
404     return new MapSplitter(this, keyValueSplitter);
405   }
406 
407   /**
408    * An object that splits strings into maps as {@code Splitter} splits
409    * iterables and lists. Like {@code Splitter}, it is thread-safe and
410    * immutable.
411    *
412    * @since 10.0
413    */
414   @Beta
415   public static final class MapSplitter {
416     private static final String INVALID_ENTRY_MESSAGE =
417         "Chunk [%s] is not a valid entry";
418     private final Splitter outerSplitter;
419     private final Splitter entrySplitter;
420 
421     private MapSplitter(Splitter outerSplitter, Splitter entrySplitter) {
422       this.outerSplitter = outerSplitter; // only "this" is passed
423       this.entrySplitter = checkNotNull(entrySplitter);
424     }
425 
426     /**
427      * Splits {@code sequence} into substrings, splits each substring into
428      * an entry, and returns an unmodifiable map with each of the entries. For
429      * example, <code>
430      * Splitter.on(';').trimResults().withKeyValueSeparator("=>")
431      * .split("a=>b ; c=>b")
432      * </code> will return a mapping from {@code "a"} to {@code "b"} and
433      * {@code "c"} to {@code b}.
434      *
435      * <p>The returned map preserves the order of the entries from
436      * {@code sequence}.
437      *
438      * @throws IllegalArgumentException if the specified sequence does not split
439      *         into valid map entries, or if there are duplicate keys
440      */
441     public Map<String, String> split(CharSequence sequence) {
442       Map<String, String> map = new LinkedHashMap<String, String>();
443       for (String entry : outerSplitter.split(sequence)) {
444         Iterator<String> entryFields = entrySplitter.splittingIterator(entry);
445 
446         checkArgument(entryFields.hasNext(), INVALID_ENTRY_MESSAGE, entry);
447         String key = entryFields.next();
448         checkArgument(!map.containsKey(key), "Duplicate key [%s] found.", key);
449 
450         checkArgument(entryFields.hasNext(), INVALID_ENTRY_MESSAGE, entry);
451         String value = entryFields.next();
452         map.put(key, value);
453 
454         checkArgument(!entryFields.hasNext(), INVALID_ENTRY_MESSAGE, entry);
455       }
456       return Collections.unmodifiableMap(map);
457     }
458   }
459 
460   private interface Strategy {
461     Iterator<String> iterator(Splitter splitter, CharSequence toSplit);
462   }
463 
464   private abstract static class SplittingIterator extends AbstractIterator<String> {
465     final CharSequence toSplit;
466     final CharMatcher trimmer;
467     final boolean omitEmptyStrings;
468 
469     /**
470      * Returns the first index in {@code toSplit} at or after {@code start}
471      * that contains the separator.
472      */
473     abstract int separatorStart(int start);
474 
475     /**
476      * Returns the first index in {@code toSplit} after {@code
477      * separatorPosition} that does not contain a separator. This method is only
478      * invoked after a call to {@code separatorStart}.
479      */
480     abstract int separatorEnd(int separatorPosition);
481 
482     int offset = 0;
483     int limit;
484 
485     protected SplittingIterator(Splitter splitter, CharSequence toSplit) {
486       this.trimmer = splitter.trimmer;
487       this.omitEmptyStrings = splitter.omitEmptyStrings;
488       this.limit = splitter.limit;
489       this.toSplit = toSplit;
490     }
491 
492     @Override protected String computeNext() {
493       /*
494        * The returned string will be from the end of the last match to the
495        * beginning of the next one. nextStart is the start position of the
496        * returned substring, while offset is the place to start looking for a
497        * separator.
498        */
499       int nextStart = offset;
500       while (offset != -1) {
501         int start = nextStart;
502         int end;
503 
504         int separatorPosition = separatorStart(offset);
505         if (separatorPosition == -1) {
506           end = toSplit.length();
507           offset = -1;
508         } else {
509           end = separatorPosition;
510           offset = separatorEnd(separatorPosition);
511         }
512         if (offset == nextStart) {
513           /*
514            * This occurs when some pattern has an empty match, even if it
515            * doesn't match the empty string -- for example, if it requires
516            * lookahead or the like. The offset must be increased to look for
517            * separators beyond this point, without changing the start position
518            * of the next returned substring -- so nextStart stays the same.
519            */
520           offset++;
521           if (offset >= toSplit.length()) {
522             offset = -1;
523           }
524           continue;
525         }
526 
527         while (start < end && trimmer.matches(toSplit.charAt(start))) {
528           start++;
529         }
530         while (end > start && trimmer.matches(toSplit.charAt(end - 1))) {
531           end--;
532         }
533 
534         if (omitEmptyStrings && start == end) {
535           // Don't include the (unused) separator in next split string.
536           nextStart = offset;
537           continue;
538         }
539 
540         if (limit == 1) {
541           // The limit has been reached, return the rest of the string as the
542           // final item.  This is tested after empty string removal so that
543           // empty strings do not count towards the limit.
544           end = toSplit.length();
545           offset = -1;
546           // Since we may have changed the end, we need to trim it again.
547           while (end > start && trimmer.matches(toSplit.charAt(end - 1))) {
548             end--;
549           }
550         } else {
551           limit--;
552         }
553 
554         return toSplit.subSequence(start, end).toString();
555       }
556       return endOfData();
557     }
558   }
559 }
560